In [211]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [212]:
# Read the training set into `df`; the path is relative to the notebook's working directory.
# Alternative input kept for reference (disabled):
#   df = pd.read_csv('originalData/train_data.csv')
df = pd.read_csv(filepath_or_buffer='train_data_mod.csv')


In [213]:
def dayOfYear(month, day, year):
    """Return the 1-based ordinal day of the year for a calendar date.

    Parameters
    ----------
    month : int-like
        Calendar month, 1-12.
    day : int-like
        Day of the month.
    year : int-like
        Four-digit year; used only to decide whether it is a leap year.

    Returns
    -------
    int
        Days elapsed since the start of the year, counting January 1st as 1.

    Notes
    -----
    All three arguments are coerced with ``int`` so float values coming from
    a row-wise ``DataFrame.apply`` are accepted. The date is not validated:
    an out-of-range ``day`` is simply added to the month offset.
    """
    daysInMonth = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    month, day, year = int(month), int(day), int(year)

    # Offset contributed by the fully elapsed months, using a non-leap table.
    ordinal = sum(daysInMonth[:month - 1]) + day

    # In a leap year February has one extra day, which pushes every date from
    # March onwards forward by one. Dates in January/February (including
    # Feb 29 itself) are already correct from the table above.
    isLeapYear = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
    if isLeapYear and month > 2:
        ordinal += 1
    return ordinal

def preprocessing(df):
    """Add derived booking features to ``df`` in place.

    New columns written onto the frame passed in:

    * ``ArrivalDayOfYear`` - ``dayOfYear`` applied row by row to
      ``ArrivalMonth``, ``ArrivalDate`` and ``ArrivalYear``.
    * ``TotalNights`` - ``NumWeekendNights`` + ``NumWeekNights``.
    * ``TotalGuests`` - ``NumAdults`` + ``NumChildren``.
    * ``TotalPrice`` - ``AvgRoomPrice`` * ``TotalNights``.

    The function mutates its argument and returns ``None``.
    """
    def _arrival_ordinal(booking):
        # One booking row -> ordinal day of its arrival date.
        return dayOfYear(booking['ArrivalMonth'], booking['ArrivalDate'], booking['ArrivalYear'])

    df['ArrivalDayOfYear'] = df.apply(_arrival_ordinal, axis=1)

    df['TotalNights'] = df['NumWeekendNights'].add(df['NumWeekNights'])
    df['TotalGuests'] = df['NumAdults'].add(df['NumChildren'])

    # Must come after TotalNights, which it reads back from the frame.
    df['TotalPrice'] = df['AvgRoomPrice'].mul(df['TotalNights'])


In [214]:
# Add the derived columns to `df` in place (preprocessing returns nothing, so
# its result is not assigned), then show the frame as this cell's output.
preprocessing(df)
df

Unnamed: 0,BookingID,LeadTime,ArrivalYear,ArrivalMonth,ArrivalDate,NumWeekendNights,NumWeekNights,MealPlan,Parking,RoomType,...,RepeatedGuest,NumPrevCancellations,NumPreviousNonCancelled,AvgRoomPrice,SpecialRequests,BookingStatus,ArrivalDayOfYear,TotalNights,TotalGuests,TotalPrice
0,1,10,2018,3,31,0,1,1,0,1,...,0,0,0,95.00,0,0,90,1,1,95.00
1,2,116,2018,2,28,2,1,1,0,1,...,0,0,0,61.00,0,0,59,3,1,183.00
2,3,11,2018,7,25,1,2,1,0,1,...,0,0,0,129.75,1,1,206,3,3,389.25
3,4,3,2017,9,12,0,1,1,0,1,...,0,0,0,152.00,3,1,255,1,2,152.00
4,5,28,2018,3,7,1,3,1,0,1,...,0,0,0,87.00,0,1,66,4,2,348.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29015,29016,2,2018,11,21,1,2,1,0,1,...,0,0,0,100.67,1,1,325,3,1,302.01
29016,29017,18,2018,4,12,0,2,0,0,1,...,0,0,0,119.00,1,1,102,2,2,238.00
29017,29018,8,2017,9,29,0,3,1,0,1,...,0,0,0,65.00,0,1,272,3,1,195.00
29018,29019,20,2018,9,18,2,2,1,0,1,...,0,0,0,90.00,0,0,261,4,1,360.00


In [215]:
# Standardise the raw numeric columns of `df` in place with StandardScaler.
# The derived columns (ArrivalDayOfYear, TotalNights, TotalGuests, TotalPrice)
# are not in this list and are left unscaled.
# NOTE(review): the scaler is fitted on the whole frame before the train/test
# split in the next cell, so held-out rows contribute to the scaling
# statistics - consider fitting on the training split only.
numeric_columns = [
    'LeadTime', 'NumWeekendNights', 'NumWeekNights', 'NumAdults', 'NumChildren',
    'RepeatedGuest', 'NumPrevCancellations', 'NumPreviousNonCancelled',
    'AvgRoomPrice', 'SpecialRequests',
]
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

In [216]:
# Split into train and held-out sets. The target is BookingStatus; BookingID
# and MealPlan are excluded from the features.
# NOTE(review): test_size=0.01 leaves only 1% of rows for evaluation, so the
# metrics printed further down are computed on a small sample. A disabled
# earlier variant used test_size=0.3 and additionally dropped NumWeekendNights,
# NumWeekNights, NumAdults, NumChildren, AvgRoomPrice, ArrivalMonth,
# ArrivalDate and ArrivalYear.
features = df.drop(columns=['BookingStatus', 'BookingID', 'MealPlan'])
target = df['BookingStatus']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.01, random_state=42
)


In [217]:
# Fit the classifier on the training split. Despite the variable name `lr`,
# the active model is a random forest; the name is kept because later cells
# refer to it. Disabled alternatives that were tried in its place:
#   lr = LogisticRegression(random_state=42)
#   lr = DecisionTreeClassifier(random_state=42)
lr = RandomForestClassifier(random_state=42)
lr.fit(X=X_train, y=y_train)

In [218]:
# Score the fitted model on the held-out split and print each metric.
y_pred = lr.predict(X_test)

# Label/function pairs for the scalar metrics, printed in this order.
metric_report = (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 Score:', f1_score),
)
for metric_label, metric_fn in metric_report:
    print(metric_label, metric_fn(y_test, y_pred))

# The confusion matrix is an array, so it goes on its own line after the label.
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))


Accuracy: 0.9278350515463918
Precision: 0.9371980676328503
Recall: 0.9603960396039604
F1 Score: 0.9486552567237163
Confusion Matrix:
 [[ 76  13]
 [  8 194]]


In [219]:
# Per-feature importance scores exposed by the fitted model, in the same
# order as the training columns.
importance = lr.feature_importances_

# Pair each score with its column name and rank from highest to lowest.
# Tuples compare on score first, so ties fall back to the column name.
sorted_importance = sorted(zip(importance, X_train.columns), reverse=True)

# One line per feature: rank, left-aligned name, score as a percentage.
row_template = "{:2d}) {:<30} {:.2%}"
for rank, (score, feature_name) in enumerate(sorted_importance, start=1):
    print(row_template.format(rank, feature_name, score))

 1) LeadTime                       27.60%
 2) AvgRoomPrice                   12.09%
 3) TotalPrice                     9.97%
 4) SpecialRequests                9.59%
 5) ArrivalDayOfYear               9.18%
 6) ArrivalDate                    6.31%
 7) MarketSegment                  4.61%
 8) ArrivalMonth                   4.52%
 9) TotalNights                    2.78%
10) NumWeekNights                  2.73%
11) ArrivalYear                    2.58%
12) NumWeekendNights               2.27%
13) TotalGuests                    1.74%
14) NumAdults                      1.40%
15) RoomType                       1.29%
16) Parking                        0.56%
17) NumChildren                    0.47%
18) NumPreviousNonCancelled        0.15%
19) RepeatedGuest                  0.15%
20) NumPrevCancellations           0.03%
