In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

plt.rcParams.update({'font.size' : 14})

In [2]:
df = pd.read_csv('filtered_customer_booking.csv', index_col = 0 )

In [3]:
df = df.reset_index(drop=True)
df

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,Internet,RoundTrip,262,19,7,6,AKLDEL,New Zealand,1,0,0,5.52,0
1,1,Internet,RoundTrip,112,20,3,6,AKLDEL,New Zealand,0,0,0,5.52,0
2,2,Internet,RoundTrip,243,22,17,3,AKLDEL,India,1,1,0,5.52,0
3,1,Internet,RoundTrip,96,31,4,6,AKLDEL,New Zealand,0,0,1,5.52,0
4,2,Internet,RoundTrip,68,22,15,3,AKLDEL,India,1,0,1,5.52,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49977,2,Internet,RoundTrip,27,6,9,6,PERPNH,Australia,1,0,1,5.62,0
49978,1,Internet,RoundTrip,111,6,4,7,PERPNH,Australia,0,0,0,5.62,0
49979,1,Internet,RoundTrip,24,6,22,6,PERPNH,Australia,0,0,1,5.62,0
49980,1,Internet,RoundTrip,15,6,11,1,PERPNH,Australia,1,0,1,5.62,0


We will one hot encode the categorical data.
"One-hot encoding" is a data preprocessing technique used in machine learning and data analysis to convert categorical data (data that represents categories or labels) into a numerical format that can be used for modeling. It's commonly applied to CSV files or other tabular data.

## Model

In [4]:
df_final = df

In [5]:
from sklearn.preprocessing import OneHotEncoder

# Create the instance of the OHE
encoder = OneHotEncoder(handle_unknown= 'ignore')
# handle_unknown='ignore': This parameter configuration specifies how the encoder should handle new or unknown categories that it hasn't seen during the training phase.

#one hot encode 'sales_channel'
encoder_df = pd.DataFrame(encoder.fit_transform(df[["sales_channel"]]).toarray())
encoder_df = encoder_df.rename(columns={ 0 : 'Internet', 1 : 'Mobile'})
df_final = df_final.join(encoder_df)

In [6]:
#one hot encode trip type
encoder_df = pd.DataFrame(encoder.fit_transform(df[['trip_type']]).toarray())
encoder_df = encoder_df.rename(columns = {0 : 'Round Trip', 1 : 'One Way Trip', 2 : 'Circle Trip'})
df_final = df_final.join(encoder_df)

In [7]:
# We can now drop categorical columns
df_final = df_final.drop(columns = ['trip_type', 'sales_channel', 'booking_origin', 'route'])
df_final

Unnamed: 0,num_passengers,purchase_lead,length_of_stay,flight_hour,flight_day,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete,Internet,Mobile,Round Trip,One Way Trip,Circle Trip
0,2,262,19,7,6,1,0,0,5.52,0,1.0,0.0,0.0,0.0,1.0
1,1,112,20,3,6,0,0,0,5.52,0,1.0,0.0,0.0,0.0,1.0
2,2,243,22,17,3,1,1,0,5.52,0,1.0,0.0,0.0,0.0,1.0
3,1,96,31,4,6,0,0,1,5.52,0,1.0,0.0,0.0,0.0,1.0
4,2,68,22,15,3,1,0,1,5.52,0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49977,2,27,6,9,6,1,0,1,5.62,0,1.0,0.0,0.0,0.0,1.0
49978,1,111,6,4,7,0,0,0,5.62,0,1.0,0.0,0.0,0.0,1.0
49979,1,24,6,22,6,0,0,1,5.62,0,1.0,0.0,0.0,0.0,1.0
49980,1,15,6,11,1,1,0,1,5.62,0,1.0,0.0,0.0,0.0,1.0


In [8]:
# We will store the column 'booking_complete' for supervised learning
label = df.booking_complete
df_final = df_final.drop(columns =['booking_complete'])
df_final

Unnamed: 0,num_passengers,purchase_lead,length_of_stay,flight_hour,flight_day,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,Internet,Mobile,Round Trip,One Way Trip,Circle Trip
0,2,262,19,7,6,1,0,0,5.52,1.0,0.0,0.0,0.0,1.0
1,1,112,20,3,6,0,0,0,5.52,1.0,0.0,0.0,0.0,1.0
2,2,243,22,17,3,1,1,0,5.52,1.0,0.0,0.0,0.0,1.0
3,1,96,31,4,6,0,0,1,5.52,1.0,0.0,0.0,0.0,1.0
4,2,68,22,15,3,1,0,1,5.52,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49977,2,27,6,9,6,1,0,1,5.62,1.0,0.0,0.0,0.0,1.0
49978,1,111,6,4,7,0,0,0,5.62,1.0,0.0,0.0,0.0,1.0
49979,1,24,6,22,6,0,0,1,5.62,1.0,0.0,0.0,0.0,1.0
49980,1,15,6,11,1,1,0,1,5.62,1.0,0.0,0.0,0.0,1.0


## Normalizing the values
Scaling numerical features to a common range or distribution. This ensures that all features have a consistent scale, preventing any single feature from dominating the learning process due to its magnitude.

In [9]:
from sklearn.preprocessing import StandardScaler

#create a scaler object
scaler = StandardScaler()

#fit and transform the data
scaled_df = scaler.fit_transform(df_final)