In [85]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [86]:
# Read the training set into a DataFrame.
# A commented-out alternative source used here was 'originalData/train_data.csv'.
data_path = 'train_data_mod.csv'
df = pd.read_csv(data_path)


In [87]:
def dayOfYear(month, day, year):
    """Return the 1-based ordinal day of the year for a calendar date.

    Parameters
    ----------
    month : int, float or numeric str
        Calendar month (1-12); coerced with ``int()``.
    day : int, float or numeric str
        Day of the month; coerced with ``int()``. It is NOT validated against
        the length of the month, so an impossible date such as 29 February in
        a non-leap year simply yields the next ordinal (60).
    year : int, float or numeric str
        Gregorian year; coerced with ``int()``.

    Returns
    -------
    int
        1 for 1 January, up to 365 (or 366 in a leap year) for 31 December.
    """
    month, day, year = int(month), int(day), int(year)

    # Month lengths for a common (non-leap) year.
    daysInMonth = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    ordinal = sum(daysInMonth[:month - 1]) + day

    # Gregorian leap rule: divisible by 4, except centuries not divisible by 400.
    isLeapYear = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)

    # In a leap year every date from March onwards is pushed back by the extra
    # day in February, so the ordinal grows by one (29 Feb itself is already 60).
    if isLeapYear and month > 2:
        ordinal += 1
    return ordinal

def preprocessing(df):
    """Add engineered booking features to ``df`` in place and return it.

    Columns read: 'ArrivalMonth', 'ArrivalDate', 'ArrivalYear',
    'NumWeekendNights', 'NumWeekNights', 'NumAdults', 'NumChildren',
    'AvgRoomPrice'.

    Columns written:
      * 'ArrivalDayOfYear' - result of ``dayOfYear`` for each row's arrival date
      * 'TotalNights'      - 'NumWeekendNights' + 'NumWeekNights'
      * 'TotalGuests'      - 'NumAdults' + 'NumChildren'
      * 'TotalPrice'       - 'AvgRoomPrice' * 'TotalNights'

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns listed above; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, so both ``preprocessing(df)`` and
        ``df = preprocessing(df)`` leave ``df`` usable.
    """
    # Walk the three date columns directly instead of a row-wise df.apply:
    # no per-row Series is built, and an empty frame yields an empty column.
    df['ArrivalDayOfYear'] = [
        dayOfYear(month, day, year)
        for month, day, year in zip(df['ArrivalMonth'], df['ArrivalDate'], df['ArrivalYear'])
    ]

    df['TotalNights'] = df['NumWeekendNights'] + df['NumWeekNights']

    df['TotalGuests'] = df['NumAdults'] + df['NumChildren']

    # Uses the 'TotalNights' column computed just above.
    df['TotalPrice'] = df['AvgRoomPrice'] * df['TotalNights']

    return df


In [88]:
# Display the loaded frame as the cell's output.
# The preprocessing() call below is left disabled, so the engineered columns it
# would add are not present in `df` for the cells that follow.
# preprocessing(df)
df

Unnamed: 0,BookingID,LeadTime,ArrivalYear,ArrivalMonth,ArrivalDate,NumWeekendNights,NumWeekNights,MealPlan,Parking,RoomType,NumAdults,NumChildren,MarketSegment,RepeatedGuest,NumPrevCancellations,NumPreviousNonCancelled,AvgRoomPrice,SpecialRequests,BookingStatus
0,1,10,2018,3,31,0,1,1,0,1,1,0,2,0,0,0,95.00,0,0
1,2,116,2018,2,28,2,1,1,0,1,1,0,1,0,0,0,61.00,0,0
2,3,11,2018,7,25,1,2,1,0,1,2,1,1,0,0,0,129.75,1,1
3,4,3,2017,9,12,0,1,1,0,1,2,0,1,0,0,0,152.00,3,1
4,5,28,2018,3,7,1,3,1,0,1,2,0,0,0,0,0,87.00,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29015,29016,2,2018,11,21,1,2,1,0,1,1,0,1,0,0,0,100.67,1,1
29016,29017,18,2018,4,12,0,2,0,0,1,2,0,1,0,0,0,119.00,1,1
29017,29018,8,2017,9,29,0,3,1,0,1,1,0,2,0,0,0,65.00,0,1
29018,29019,20,2018,9,18,2,2,1,0,1,1,0,0,0,0,0,90.00,0,0


In [89]:
# Standardise the numeric predictors (zero mean, unit variance).
numeric_cols = [
    'LeadTime', 'NumWeekendNights', 'NumWeekNights', 'NumAdults', 'NumChildren',
    'RepeatedGuest', 'NumPrevCancellations', 'NumPreviousNonCancelled',
    'AvgRoomPrice', 'SpecialRequests',
]

# Fit the scaler on the training rows only, so that statistics of the held-out
# rows do not leak into the features the model is evaluated on.
# With shuffling and no stratification, train_test_split derives its partition
# from the number of rows and random_state alone, so calling it here with the
# same test_size / random_state as the modelling split selects the same
# training rows. Keep these two arguments in sync with that split.
scaler_fit_rows, _ = train_test_split(df, test_size=0.3, random_state=42)

scaler = StandardScaler()
scaler.fit(scaler_fit_rows[numeric_cols])

# Apply the train-fitted transform to every row; `df` is overwritten in place.
df[numeric_cols] = scaler.transform(df[numeric_cols])

In [90]:
# Separate the predictors from the label and hold out part of the rows.
# 'BookingID' is dropped together with the label (presumably a row identifier
# rather than a predictor - confirm).
# A variant left disabled here additionally dropped the raw columns
# 'NumWeekendNights', 'NumWeekNights', 'NumAdults', 'NumChildren',
# 'AvgRoomPrice', 'ArrivalMonth', 'ArrivalDate' and 'ArrivalYear'.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['BookingStatus', 'BookingID']),
    df['BookingStatus'],
    test_size=0.3,      # 30% of the rows go to the evaluation split
    random_state=42,    # fixed seed so the partition is reproducible
)


In [91]:
# Fit the classifier on the training split.
# The variable name `lr` is a leftover from the logistic-regression variant;
# it currently holds a random forest. Disabled alternatives:
#   lr = LogisticRegression(random_state=42)
#   lr = DecisionTreeClassifier(random_state=42)
lr = RandomForestClassifier(random_state=42)
lr.fit(X=X_train, y=y_train)

In [92]:
# Score the fitted model on the held-out split.
y_pred = lr.predict(X_test)

# Scalar metrics share one call signature, so print them from a single loop.
for label, metric in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 Score:', f1_score),
):
    print(label, metric(y_test, y_pred))

# confusion_matrix puts true labels on rows and predicted labels on columns.
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))


Accuracy: 0.899494601424305
Precision: 0.9117598548573314
Recall: 0.9420586230402181
F1 Score: 0.9266616377503981
Confusion Matrix:
 [[2303  535]
 [ 340 5528]]


In [93]:
# Importance of each input column as reported by the fitted model.
importance = lr.feature_importances_

# Pair every score with its column name and order from most to least important
# (tuples sort by score first, then by name).
sorted_importance = sorted(zip(importance, X_train.columns), reverse=True)

# One line per feature: rank, left-aligned name, importance as a percentage.
for rank, (score, feature) in enumerate(sorted_importance, start=1):
    print(f"{rank:2d}) {feature:<30} {score:.2%}")

 1) LeadTime                       31.81%
 2) AvgRoomPrice                   16.05%
 3) SpecialRequests                9.61%
 4) ArrivalDate                    9.10%
 5) ArrivalMonth                   8.19%
 6) NumWeekNights                  5.30%
 7) MarketSegment                  5.11%
 8) NumWeekendNights               3.80%
 9) ArrivalYear                    2.79%
10) NumAdults                      2.52%
11) MealPlan                       2.23%
12) RoomType                       1.72%
13) NumChildren                    0.80%
14) Parking                        0.57%
15) RepeatedGuest                  0.19%
16) NumPreviousNonCancelled        0.19%
17) NumPrevCancellations           0.04%
