In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [37]:
# Read the training set into a DataFrame.
# An alternative source that was used previously: 'originalData/train_data.csv'.
csv_path = 'train_data_mod.csv'
df = pd.read_csv(csv_path)


In [38]:
# Standardize the numeric columns (zero mean, unit variance).
#
# The scaler's statistics are learned from the TRAINING rows only, so that no
# information about the held-out test rows leaks into preprocessing. Fitting on
# the full frame (as was done before) lets the test set influence the mean and
# standard deviation used to transform the training data.
#
# The row partition is reproduced here with the same test_size / random_state
# that the train/test split cell uses. A non-stratified train_test_split picks
# rows from the row count and the seed alone, so both calls select the same
# rows. Keep the two calls in sync if either value changes.
numeric_cols = [
    'LeadTime', 'NumWeekendNights', 'NumWeekNights', 'NumAdults', 'NumChildren',
    'RepeatedGuest', 'NumPrevCancellations', 'NumPreviousNonCancelled',
    'AvgRoomPrice', 'SpecialRequests',
]

# Positional row numbers, so the selection does not depend on the index labels.
train_rows, _ = train_test_split(list(range(len(df))), test_size=0.3, random_state=42)

scaler = StandardScaler()
scaler.fit(df.iloc[train_rows][numeric_cols])

# Apply the train-fitted transform to every row (train and test alike).
df[numeric_cols] = scaler.transform(df[numeric_cols])

In [39]:
# Build the feature matrix and target, then hold out 30% of the rows for testing.
# The target ('BookingStatus'), the identifier ('BookingID') and three
# booking-history columns are excluded from the features.
excluded_cols = ['BookingStatus', 'BookingID', 'NumPrevCancellations',
                 'NumPreviousNonCancelled', 'RepeatedGuest']
features = df.drop(columns=excluded_cols)
target = df['BookingStatus']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)


In [40]:
# Fit the classifier on the training split.
# NOTE: the variable is called `lr`, but the active estimator is a random
# forest; the name is kept because later cells reference it.
# Alternatives that can be swapped in here:
#   LogisticRegression(random_state=42)
#   DecisionTreeClassifier(random_state=42)
lr = RandomForestClassifier(random_state=42)
lr.fit(X=X_train, y=y_train)

In [41]:
# Score the fitted model on the held-out test split.
y_pred = lr.predict(X_test)

# Scalar metrics, each called with its default arguments, printed in a fixed order.
scalar_metrics = (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 Score:', f1_score),
)
for label, metric_fn in scalar_metrics:
    print(label, metric_fn(y_test, y_pred))

# The confusion matrix is printed last, on its own lines.
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))


Accuracy: 0.8991500114863312
Precision: 0.9118520964014526
Recall: 0.9413769597818678
F1 Score: 0.9263793392587623
Confusion Matrix:
 [[2304  534]
 [ 344 5524]]


In [42]:
# Per-feature importances reported by the fitted estimator, in the same order
# as the columns of X_train.
importance = lr.feature_importances_

# Pair each importance with its column name and rank from highest to lowest.
# Tuples sort by importance first; equal importances fall back to the name.
sorted_importance = sorted(zip(importance, X_train.columns), reverse=True)

# One line per feature: 1-based rank, left-aligned name, importance as a percentage.
for rank, (score, feature) in enumerate(sorted_importance, start=1):
    print("{:2d}) {:<30} {:.2%}".format(rank, feature, score))

 1) LeadTime                       31.73%
 2) AvgRoomPrice                   16.25%
 3) SpecialRequests                9.80%
 4) ArrivalDate                    9.29%
 5) ArrivalMonth                   8.26%
 6) NumWeekNights                  5.35%
 7) MarketSegment                  5.17%
 8) NumWeekendNights               3.78%
 9) ArrivalYear                    2.62%
10) NumAdults                      2.48%
11) MealPlan                       2.22%
12) RoomType                       1.65%
13) NumChildren                    0.82%
14) Parking                        0.58%
