### Reading the dataset

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
import os

# Raw string: the Windows path contains "\F", "\E" and "\B", which are invalid
# escape sequences in a normal string literal (SyntaxWarning on recent Python
# versions). The resulting path value is unchanged.
# NOTE(review): this is a machine-specific absolute path; the notebook only
# runs where this directory exists.
os.chdir(r'G:\FROM DESKTOP 11_07_23\ETL Hive\British Airways')

In [5]:
import pandas as pd

# Load the bookings CSV from the current working directory and preview it.
# NOTE(review): the 'unicode_escape' codec also decodes backslash sequences
# that appear in the data; confirm it is really needed for this file.
csv_name = 'customer_booking.csv'
df = pd.read_csv(csv_name, encoding='unicode_escape')
df.head()

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0
1,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0
2,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0
3,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0
4,2,Internet,RoundTrip,68,22,15,Wed,AKLDEL,India,1,0,1,5.52,0


In [6]:
# Summarize the frame: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   num_passengers         50000 non-null  int64  
 1   sales_channel          50000 non-null  object 
 2   trip_type              50000 non-null  object 
 3   purchase_lead          50000 non-null  int64  
 4   length_of_stay         50000 non-null  int64  
 5   flight_hour            50000 non-null  int64  
 6   flight_day             50000 non-null  object 
 7   route                  50000 non-null  object 
 8   booking_origin         50000 non-null  object 
 9   wants_extra_baggage    50000 non-null  int64  
 10  wants_preferred_seat   50000 non-null  int64  
 11  wants_in_flight_meals  50000 non-null  int64  
 12  flight_duration        50000 non-null  float64
 13  booking_complete       50000 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 5.3+ 

In [7]:
# Count missing values per column.
df.isna().sum()

num_passengers           0
sales_channel            0
trip_type                0
purchase_lead            0
length_of_stay           0
flight_hour              0
flight_day               0
route                    0
booking_origin           0
wants_extra_baggage      0
wants_preferred_seat     0
wants_in_flight_meals    0
flight_duration          0
booking_complete         0
dtype: int64

In [8]:
# Count rows that are exact duplicates of an earlier row.
# NOTE(review): the recorded result is non-zero and no cell visible here
# removes these rows before modelling — confirm that is intended.
df.duplicated().sum()

719

In [9]:
# The in-flight-meal flag is the prediction target; every other column is a feature.
target_col = 'wants_in_flight_meals'
Y = df[[target_col]]               # double brackets keep Y a one-column DataFrame
X = df.drop(columns=[target_col])

In [10]:
# Preview the feature frame (target column removed).
X.head()

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,flight_duration,booking_complete
0,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,5.52,0
1,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,5.52,0
2,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,5.52,0
3,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,5.52,0
4,2,Internet,RoundTrip,68,22,15,Wed,AKLDEL,India,1,0,5.52,0


In [11]:
# Preview the target frame.
Y.head()

Unnamed: 0,wants_in_flight_meals
0,0
1,0
2,0
3,1
4,1


### Separate categorical and continuous columns

In [12]:
from functionpackage import catconsep

In [13]:
# Split the feature names into two lists using the project-local helper
# `catconsep` (its implementation is not part of this notebook).
# NOTE(review): judging by the displayed results, `cat` receives the
# object-dtype columns and `con` the numeric ones — confirm against the helper.
cat, con=catconsep(X)

In [14]:
# Show the column names assigned to the categorical group.
cat

['sales_channel', 'trip_type', 'flight_day', 'route', 'booking_origin']

In [15]:
# Show the column names assigned to the continuous group.
con

['num_passengers',
 'purchase_lead',
 'length_of_stay',
 'flight_hour',
 'wants_extra_baggage',
 'wants_preferred_seat',
 'flight_duration',
 'booking_complete']

### Create preprocessing pipeline for X

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [54]:
# Numeric branch: fill gaps with the column mean, then standardize.
num_steps = [
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
num_pipe = Pipeline(steps=num_steps)

In [55]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen at fit time are ignored at transform time.
cat_steps = [
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
]
cat_pipe = Pipeline(steps=cat_steps)

In [56]:
# Route each column group through its own pipeline; the outputs are
# concatenated with the numeric block first, then the categorical block.
branches = [
    ('num', num_pipe, con),
    ('cat', cat_pipe, cat),
]
pre = ColumnTransformer(branches)

In [57]:
# Display the assembled (not yet fitted) transformer.
pre

In [60]:
# Fit the preprocessing on X and transform it; the result is converted from
# its sparse form to a dense NumPy array.
# NOTE(review): the transformer is fitted on all rows before the train/test
# split, so scaling statistics and encoder categories also come from rows that
# later land in the test set — consider fitting on the training split only.
X_sparse = pre.fit_transform(X)
X_pre = X_sparse.toarray()
X_pre

array([[ 0.40068439,  1.95753005, -0.11935316, ...,  0.        ,
         0.        ,  0.        ],
       [-0.57955926,  0.29916394, -0.08984361, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.40068439,  1.74747034, -0.0308245 , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.57955926, -0.67374418, -0.50297735, ...,  0.        ,
         0.        ,  0.        ],
       [-0.57955926, -0.77324614, -0.50297735, ...,  0.        ,
         0.        ,  0.        ],
       [-0.57955926, -0.72902305, -0.50297735, ...,  0.        ,
         0.        ,  0.        ]])

In [61]:
# Output column names produced by the fitted transformer; each name is
# prefixed with its branch ("num__" / "cat__").
cols = pre.get_feature_names_out()
cols

array(['num__num_passengers', 'num__purchase_lead', 'num__length_of_stay',
       'num__flight_hour', 'num__wants_extra_baggage',
       'num__wants_preferred_seat', 'num__flight_duration',
       'num__booking_complete', 'cat__sales_channel_Internet',
       'cat__sales_channel_Mobile', 'cat__trip_type_CircleTrip',
       'cat__trip_type_OneWay', 'cat__trip_type_RoundTrip',
       'cat__flight_day_Fri', 'cat__flight_day_Mon',
       'cat__flight_day_Sat', 'cat__flight_day_Sun',
       'cat__flight_day_Thu', 'cat__flight_day_Tue',
       'cat__flight_day_Wed', 'cat__route_AKLDEL', 'cat__route_AKLHGH',
       'cat__route_AKLHND', 'cat__route_AKLICN', 'cat__route_AKLKIX',
       'cat__route_AKLKTM', 'cat__route_AKLKUL', 'cat__route_AKLMRU',
       'cat__route_AKLPEK', 'cat__route_AKLPVG', 'cat__route_AKLTPE',
       'cat__route_AORICN', 'cat__route_AORKIX', 'cat__route_AORKTM',
       'cat__route_AORMEL', 'cat__route_AORPER', 'cat__route_AORPUS',
       'cat__route_BBIMEL', 'cat__route_B

In [62]:
# Attach the generated feature names to the dense matrix.
# Note: this rebinds X_pre from a NumPy array to a DataFrame.
X_pre = pd.DataFrame(data=X_pre, columns=cols)
X_pre.head()

Unnamed: 0,num__num_passengers,num__purchase_lead,num__length_of_stay,num__flight_hour,num__wants_extra_baggage,num__wants_preferred_seat,num__flight_duration,num__booking_complete,cat__sales_channel_Internet,cat__sales_channel_Mobile,...,cat__booking_origin_Timor-Leste,cat__booking_origin_Tonga,cat__booking_origin_Tunisia,cat__booking_origin_Turkey,cat__booking_origin_Ukraine,cat__booking_origin_United Arab Emirates,cat__booking_origin_United Kingdom,cat__booking_origin_United States,cat__booking_origin_Vanuatu,cat__booking_origin_Vietnam
0,0.400684,1.95753,-0.119353,-0.381764,0.703747,-0.649919,-1.174175,-0.419359,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.579559,0.299164,-0.089844,-1.12078,-1.420965,-0.649919,-1.174175,-0.419359,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.400684,1.74747,-0.030824,1.465775,0.703747,1.538654,-1.174175,-0.419359,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.579559,0.122272,0.234761,-0.936026,-1.420965,-0.649919,-1.174175,-0.419359,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.400684,-0.18729,-0.030824,1.096267,0.703747,-0.649919,-1.174175,-0.419359,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Applying train test split

In [63]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X_pre,
    Y,
    test_size=0.2,
    random_state=10,
)

In [64]:
# Preview the training features.
xtrain.head()

Unnamed: 0,num__num_passengers,num__purchase_lead,num__length_of_stay,num__flight_hour,num__wants_extra_baggage,num__wants_preferred_seat,num__flight_duration,num__booking_complete,cat__sales_channel_Internet,cat__sales_channel_Mobile,...,cat__booking_origin_Timor-Leste,cat__booking_origin_Tonga,cat__booking_origin_Tunisia,cat__booking_origin_Turkey,cat__booking_origin_Ukraine,cat__booking_origin_United Arab Emirates,cat__booking_origin_United Kingdom,cat__booking_origin_United States,cat__booking_origin_Vanuatu,cat__booking_origin_Vietnam
20433,-0.579559,-0.861692,1.120048,0.542005,0.703747,1.538654,1.037139,-0.419359,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28866,-0.579559,1.09518,-0.591506,-0.566518,-1.420965,-0.649919,-1.107368,2.384592,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42817,0.400684,-0.872748,-0.502977,-0.19701,0.703747,1.538654,-0.572911,-0.419359,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9543,-0.579559,-0.331015,0.205252,-1.675042,0.703747,1.538654,1.037139,-0.419359,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20619,0.400684,-0.740079,0.411819,0.726759,0.703747,-0.649919,1.037139,-0.419359,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
# Preview the test features.
xtest.head()

Unnamed: 0,num__num_passengers,num__purchase_lead,num__length_of_stay,num__flight_hour,num__wants_extra_baggage,num__wants_preferred_seat,num__flight_duration,num__booking_complete,cat__sales_channel_Internet,cat__sales_channel_Mobile,...,cat__booking_origin_Timor-Leste,cat__booking_origin_Tonga,cat__booking_origin_Tunisia,cat__booking_origin_Turkey,cat__booking_origin_Ukraine,cat__booking_origin_United Arab Emirates,cat__booking_origin_United Kingdom,cat__booking_origin_United States,cat__booking_origin_Vanuatu,cat__booking_origin_Vietnam
27632,-0.579559,-0.1099,-0.591506,0.911513,-1.420965,-0.649919,-0.18543,2.384592,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36119,-0.579559,-0.751135,-0.532487,1.835282,-1.420965,1.538654,-1.107368,-0.419359,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4796,0.400684,0.066993,-0.001315,-1.12078,0.703747,-0.649919,1.037139,-0.419359,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3648,-0.579559,0.564503,0.234761,0.542005,0.703747,-0.649919,1.037139,-0.419359,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24501,-0.579559,-0.828525,0.175742,-1.12078,-1.420965,-0.649919,0.870121,-0.419359,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
# Preview the training target.
ytrain.head()

Unnamed: 0,wants_in_flight_meals
20433,1
28866,0
42817,0
9543,1
20619,1


In [67]:
# Preview the test target.
ytest.head()

Unnamed: 0,wants_in_flight_meals
27632,0
36119,0
4796,0
3648,1
24501,0


In [69]:
# (rows, columns) of the training features.
xtrain.shape

(40000, 923)

In [70]:
# (rows, columns) of the training target; row count should match xtrain.
ytrain.shape

(40000, 1)

In [71]:
# (rows, columns) of the test features.
xtest.shape

(10000, 923)

In [72]:
# (rows, columns) of the test target; row count should match xtest.
ytest.shape

(10000, 1)

### Check results for Random Forest

In [73]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [74]:
# Create a Random Forest classifier with library-default hyperparameters;
# only the random seed is fixed so repeated runs build the same forest.
rf_classifier = RandomForestClassifier(random_state=42)

In [75]:
# Fit on the training split. The one-column target frame is flattened to a
# 1-D array first, the shape expected for a single-output target.
rf_classifier.fit(xtrain, ytrain.to_numpy().ravel())

In [76]:
# Predicted class labels for the held-out test features.
predictions = rf_classifier.predict(xtest)


In [77]:
# Share of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=ytest, y_pred=predictions)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.69


In [78]:
# Per-class precision, recall, F1 and support on the test split.
report_text = classification_report(y_true=ytest, y_pred=predictions)
print("\nClassification Report:")
print(report_text)


Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.79      0.75      5872
           1       0.65      0.55      0.60      4128

    accuracy                           0.69     10000
   macro avg       0.68      0.67      0.67     10000
weighted avg       0.69      0.69      0.69     10000

