In [70]:
!pip install chardet
import chardet
import pandas as pd
import numpy as np



In [71]:
# Defining function to detect encoding
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        rawdata = f.read()
    result = chardet.detect(rawdata)
    return result['encoding']

# File path
file_path = r'C:\Users\bryan\Desktop\FORAGE\CUSTOMER BOOKINGS.csv'

# Detect encoding
encoding = detect_encoding(file_path)

# Read CSV file with detected encoding
df = pd.read_csv(file_path, encoding=encoding)

# Display the DataFrame
df

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,6,0
1,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,6,0
2,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,6,0
3,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,6,0
4,2,Internet,RoundTrip,68,22,15,Wed,AKLDEL,India,1,0,1,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2,Internet,RoundTrip,27,6,9,Sat,PERPNH,Australia,1,0,1,6,0
49996,1,Internet,RoundTrip,111,6,4,Sun,PERPNH,Australia,0,0,0,6,0
49997,1,Internet,RoundTrip,24,6,22,Sat,PERPNH,Australia,0,0,1,6,0
49998,1,Internet,RoundTrip,15,6,11,Mon,PERPNH,Australia,1,0,1,6,0


In [72]:
df.head()

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,6,0
1,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,6,0
2,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,6,0
3,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,6,0
4,2,Internet,RoundTrip,68,22,15,Wed,AKLDEL,India,1,0,1,6,0


In [73]:
#checking the shape of the dataframe
df.shape

(50000, 14)

In [74]:
#checking if the data has any null/missing values
df.isnull().sum()

num_passengers           0
sales_channel            0
trip_type                0
purchase_lead            0
length_of_stay           0
flight_hour              0
flight_day               0
route                    0
booking_origin           0
wants_extra_baggage      0
wants_preferred_seat     0
wants_in_flight_meals    0
flight_duration          0
booking_complete         0
dtype: int64

In [75]:
#checking the data dypes of the features
df.dtypes

num_passengers            int64
sales_channel            object
trip_type                object
purchase_lead             int64
length_of_stay            int64
flight_hour               int64
flight_day               object
route                    object
booking_origin           object
wants_extra_baggage       int64
wants_preferred_seat      int64
wants_in_flight_meals     int64
flight_duration           int64
booking_complete          int64
dtype: object

In [76]:
#checking the cloumns/features in the dataframe
df.columns

Index(['num_passengers', 'sales_channel', 'trip_type', 'purchase_lead',
       'length_of_stay', 'flight_hour', 'flight_day', 'route',
       'booking_origin', 'wants_extra_baggage', 'wants_preferred_seat',
       'wants_in_flight_meals', 'flight_duration', 'booking_complete'],
      dtype='object')

In [77]:
#descriptive statistics for the specific columns with integer data types
columns_of_interest = ['num_passengers', 'purchase_lead','length_of_stay', 'flight_hour','wants_extra_baggage','wants_preferred_seat','wants_in_flight_meals','flight_duration','booking_complete']
description_specific = df[columns_of_interest].describe()
print(description_specific)

       num_passengers  purchase_lead  length_of_stay  flight_hour  \
count    50000.000000   50000.000000     50000.00000  50000.00000   
mean         1.591240      84.940480        23.04456      9.06634   
std          1.020165      90.451378        33.88767      5.41266   
min          1.000000       0.000000         0.00000      0.00000   
25%          1.000000      21.000000         5.00000      5.00000   
50%          1.000000      51.000000        17.00000      9.00000   
75%          2.000000     115.000000        28.00000     13.00000   
max          9.000000     867.000000       778.00000     23.00000   

       wants_extra_baggage  wants_preferred_seat  wants_in_flight_meals  \
count         50000.000000          50000.000000           50000.000000   
mean              0.668780              0.296960               0.427140   
std               0.470657              0.456923               0.494668   
min               0.000000              0.000000               0.000000   
25%

In [78]:
#importing the necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [79]:
#converting categorical columns to numerical 
categorical_cols = ['sales_channel', 'trip_type', 'flight_day', 'route', 'booking_origin']

df = pd.get_dummies(df, columns=categorical_cols)

In [80]:
print(df.columns)

Index(['num_passengers', 'purchase_lead', 'length_of_stay', 'flight_hour',
       'wants_extra_baggage', 'wants_preferred_seat', 'wants_in_flight_meals',
       'flight_duration', 'booking_complete', 'sales_channel_Internet',
       ...
       'booking_origin_Timor-Leste', 'booking_origin_Tonga',
       'booking_origin_Tunisia', 'booking_origin_Turkey',
       'booking_origin_Ukraine', 'booking_origin_United Arab Emirates',
       'booking_origin_United Kingdom', 'booking_origin_United States',
       'booking_origin_Vanuatu', 'booking_origin_Vietnam'],
      dtype='object', length=924)


In [81]:
# Defining features (X) and target variable (y)
X = df.drop('booking_complete', axis=1)
y = df['booking_complete']

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [82]:
# Initializing the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fitting the model on the training data
rf_model.fit(X_train, y_train)

In [83]:
# Predicting on the test set
y_pred = rf_model.predict(X_test)

# Evaluating model performance e.g., accuracy
accuracy = rf_model.score(X_test, y_test)
print(f"Accuracy of the Random Forest model: {accuracy}")

Accuracy of the Random Forest model: 0.856


In [84]:
# Getting feature importances
feature_importances = rf_model.feature_importances_

# Matching feature importances with column names
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("Feature importance:")
print(feature_importance_df)

Feature importance:
                             Feature  Importance
1                      purchase_lead    0.146490
3                        flight_hour    0.121401
2                     length_of_stay    0.111863
0                     num_passengers    0.046613
870          booking_origin_Malaysia    0.027659
..                               ...         ...
886            booking_origin_Panama    0.000000
887  booking_origin_Papua New Guinea    0.000000
827           booking_origin_Belarus    0.000000
162                     route_CMBCTS    0.000000
264                     route_DACPEK    0.000000

[923 rows x 2 columns]


In [85]:
#doing cross-validation on the model performance
from sklearn.model_selection import cross_val_score, train_test_split
scores = cross_val_score(rf_model, X, y, cv=5)  # cv=5 for 5-fold cross-validation
print("Cross-validation scores:", scores)
print("Mean accuracy:", scores.mean())


Cross-validation scores: [0.8507 0.7783 0.7267 0.5421 0.7763]
Mean accuracy: 0.73482


In [86]:
#doing the precision test on the model performance
from sklearn.metrics import precision_score, classification_report
# Assuming 'X' and 'y' are already defined and rf_model is trained
y_pred = rf_model.predict(X)
precision = precision_score(y, y_pred)
print(f"Precision: {precision:.4f}")
print(classification_report(y, y_pred))

Precision: 0.9751
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     42522
           1       0.98      0.83      0.90      7478

    accuracy                           0.97     50000
   macro avg       0.97      0.91      0.94     50000
weighted avg       0.97      0.97      0.97     50000

