In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, RidgeClassifier
# from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [33]:
# Read the training and test tables from their CSV files.
# df  -> training rows, df2 -> test rows.
df, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_data.csv', 'data/test_data.csv')
)


In [34]:
def dayOfYear(month, day, year):
    """Return the 1-based ordinal day of the year for a calendar date.

    Parameters
    ----------
    month : int-like
        Month number, 1 (January) through 12 (December).
    day : int-like
        Day of the month.
    year : int-like
        Gregorian calendar year; used only to decide whether it is a leap year.

    Returns
    -------
    int
        Number of days from January 1st (inclusive) to the given date, e.g.
        1 for Jan 1 and 366 for Dec 31 of a leap year.

    Notes
    -----
    The date is not validated: an impossible day such as Feb 29 in a
    non-leap year is simply counted past the end of February.
    """
    days_in_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    month, day, year = int(month), int(day), int(year)

    # Days in all fully elapsed months (non-leap lengths) plus the day itself.
    ordinal = sum(days_in_month[:month - 1]) + day

    # In a leap year every date after February is pushed back by Feb 29,
    # so one extra day must be ADDED (Feb 29 itself needs no adjustment).
    is_leap = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
    if is_leap and month > 2:
        ordinal += 1
    return ordinal

In [35]:
def preprocessing(df):
    """Add derived booking features to ``df`` in place.

    Columns written:
      * ``ArrivalDayOfYear`` - ``dayOfYear`` applied to each row's
        ``ArrivalMonth`` / ``ArrivalDate`` / ``ArrivalYear``.
      * ``MealPlan``         - the existing column multiplied by 10.
      * ``TotalNights``      - ``NumWeekendNights + NumWeekNights``.
      * ``TotalGuests``      - ``NumAdults + NumChildren``.
      * ``TotalPrice``       - ``AvgRoomPrice * TotalNights``.

    The frame is mutated; nothing is returned.
    """
    def _arrival_ordinal(row):
        return dayOfYear(row['ArrivalMonth'], row['ArrivalDate'], row['ArrivalYear'])

    df['ArrivalDayOfYear'] = df.apply(_arrival_ordinal, axis=1)

    # NOTE(review): if MealPlan still holds text labels at this point, `* 10`
    # repeats each string ten times rather than scaling a number. Presumably
    # this is meant to run on an already integer-encoded column - confirm.
    df['MealPlan'] = df['MealPlan'] * 10

    nights = df['NumWeekendNights'] + df['NumWeekNights']
    df['TotalNights'] = nights

    df['TotalGuests'] = df['NumAdults'] + df['NumChildren']

    df['TotalPrice'] = df['AvgRoomPrice'] * nights


In [36]:
# The derived-feature step is currently switched off; uncomment the two calls
# below to add the extra columns to both frames before scaling/encoding.
# preprocessing(df)
# preprocessing(df2)
df

Unnamed: 0,BookingID,LeadTime,ArrivalYear,ArrivalMonth,ArrivalDate,NumWeekendNights,NumWeekNights,MealPlan,Parking,RoomType,NumAdults,NumChildren,MarketSegment,RepeatedGuest,NumPrevCancellations,NumPreviousNonCancelled,AvgRoomPrice,SpecialRequests,BookingStatus
0,1,10,2018,3,31,0,1,Meal Plan 1,0,Room_Type 1,1,0,Corporate,0,0,0,95.00,0,Canceled
1,2,116,2018,2,28,2,1,Meal Plan 1,0,Room_Type 1,1,0,Online,0,0,0,61.00,0,Canceled
2,3,11,2018,7,25,1,2,Meal Plan 1,0,Room_Type 1,2,1,Online,0,0,0,129.75,1,Not_Canceled
3,4,3,2017,9,12,0,1,Meal Plan 1,0,Room_Type 1,2,0,Online,0,0,0,152.00,3,Not_Canceled
4,5,28,2018,3,7,1,3,Meal Plan 1,0,Room_Type 1,2,0,Offline,0,0,0,87.00,0,Not_Canceled
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29015,29016,2,2018,11,21,1,2,Meal Plan 1,0,Room_Type 1,1,0,Online,0,0,0,100.67,1,Not_Canceled
29016,29017,18,2018,4,12,0,2,Not Selected,0,Room_Type 1,2,0,Online,0,0,0,119.00,1,Not_Canceled
29017,29018,8,2017,9,29,0,3,Meal Plan 1,0,Room_Type 1,1,0,Corporate,0,0,0,65.00,0,Not_Canceled
29018,29019,20,2018,9,18,2,2,Meal Plan 1,0,Room_Type 1,1,0,Offline,0,0,0,90.00,0,Canceled


In [37]:
# Standardise the numerical variables (zero mean, unit variance).
#
# The scaler is fitted on the TRAINING frame only and then re-used, unchanged,
# on the test frame. Fitting a second time on the test data would scale the two
# frames with different means/variances, so the model would see test features
# on a different scale from the one it was trained on.
numeric_columns = [
    'LeadTime', 'NumWeekendNights', 'NumWeekNights', 'NumAdults', 'NumChildren',
    'RepeatedGuest', 'NumPrevCancellations', 'NumPreviousNonCancelled',
    'AvgRoomPrice', 'SpecialRequests',
]

scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
df2[numeric_columns] = scaler.transform(df2[numeric_columns])

In [38]:
from sklearn.preprocessing import LabelEncoder

# Replace categorical values with integer codes.
#
# For feature columns a single encoder is fitted on the combined train + test
# values and applied to both frames, so a given category always receives the
# same code. Fitting a separate encoder per frame (as before) silently assigns
# different codes whenever the two frames do not contain the same category set.
categorical_columns = ['MealPlan', 'Parking', 'RoomType', 'MarketSegment', 'BookingStatus']
target_column = 'BookingStatus'

for col in categorical_columns:
    le = LabelEncoder()
    if col == target_column:
        # The target is learned from the training labels only.
        df[col] = le.fit_transform(df[col])
        # df2's BookingStatus is never fed to the model (it is dropped before
        # prediction further down), so it is encoded on its own, as before.
        df2[col] = LabelEncoder().fit_transform(df2[col])
    else:
        le.fit(pd.concat([df[col], df2[col]], ignore_index=True))
        df[col] = le.transform(df[col])
        df2[col] = le.transform(df2[col])

df

Unnamed: 0,BookingID,LeadTime,ArrivalYear,ArrivalMonth,ArrivalDate,NumWeekendNights,NumWeekNights,MealPlan,Parking,RoomType,NumAdults,NumChildren,MarketSegment,RepeatedGuest,NumPrevCancellations,NumPreviousNonCancelled,AvgRoomPrice,SpecialRequests,BookingStatus
0,1,-0.876476,2018,3,31,-0.930543,-0.849502,0,0,0,-1.633471,-0.259243,2,-0.16221,-0.064378,-0.087379,-0.241763,-0.788557,0
1,2,0.357733,2018,2,28,1.359612,-0.849502,0,0,0,-1.633471,-0.259243,4,-0.16221,-0.064378,-0.087379,-1.211752,-0.788557,0
2,3,-0.864832,2018,7,25,0.214534,-0.146148,0,0,0,0.295297,2.222853,4,-0.16221,-0.064378,-0.087379,0.749623,0.482631,1
3,4,-0.957980,2017,9,12,-0.930543,-0.849502,0,0,0,0.295297,-0.259243,4,-0.16221,-0.064378,-0.087379,1.384395,3.025008,1
4,5,-0.666893,2018,3,7,0.214534,0.557205,0,0,0,0.295297,-0.259243,3,-0.16221,-0.064378,-0.087379,-0.469996,-0.788557,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29015,29016,-0.969624,2018,11,21,0.214534,-0.146148,0,0,0,-1.633471,-0.259243,4,-0.16221,-0.064378,-0.087379,-0.080003,0.482631,1
29016,29017,-0.783328,2018,4,12,-0.930543,-0.146148,3,0,0,0.295297,-0.259243,4,-0.16221,-0.064378,-0.087379,0.442935,0.482631,1
29017,29018,-0.899763,2017,9,29,-0.930543,0.557205,0,0,0,-1.633471,-0.259243,2,-0.16221,-0.064378,-0.087379,-1.097635,-0.788557,1
29018,29019,-0.760041,2018,9,18,1.359612,-0.146148,0,0,0,-1.633471,-0.259243,3,-0.16221,-0.064378,-0.087379,-0.384408,-0.788557,0


In [39]:
# Hold out 20% of the training frame for evaluation.
# Features: every column except the target (BookingStatus) and the BookingID key.
# An earlier experiment additionally dropped NumWeekendNights, NumWeekNights,
# NumAdults, NumChildren, AvgRoomPrice and the three Arrival* columns.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['BookingStatus', 'BookingID']),
    df['BookingStatus'],
    test_size=0.2,
    random_state=42,
)

# Shrinking test_size (e.g. to 0.0001) trains on nearly all rows, but leaves
# too few held-out rows for the evaluation metrics below to be meaningful.

In [40]:
# Train the random forest classifier.
#
# These hyper-parameters are the ones settled on for best accuracy.
rtc = RandomForestClassifier(
    n_estimators=1000,
    criterion='gini',
    max_depth=None,
    min_samples_leaf=1,
    max_features=0.4,
    n_jobs=-1,
    random_state=42,
)

# Other estimators imported at the top (LogisticRegression, DecisionTreeClassifier,
# KNeighborsClassifier, GradientBoostingClassifier, RidgeClassifier) were tried
# here as alternatives and can be swapped in for comparison.
rtc.fit(X_train, y_train)

In [41]:
# Evaluate the model on the held-out split.
#
# Instead of rtc.predict (implicit 0.5 cut-off) the class-1 probability is
# compared against a custom threshold of 0.479.
class_one_proba = rtc.predict_proba(X_test)[:, 1]
y_pred = class_one_proba >= 0.479

for label, metric in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 Score:', f1_score),
):
    print(label, metric(y_test, y_pred))

print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))


Accuracy: 0.9102343211578222
Precision: 0.9150550795593635
Recall: 0.9555214723926381
F1 Score: 0.9348505689633612
Confusion Matrix:
 [[1545  347]
 [ 174 3738]]


In [42]:
importance = rtc.feature_importances_

# Pair every importance with its column name and order from largest to smallest.
sorted_importance = sorted(zip(importance, X_train.columns), reverse=True)

# One line per feature: rank, column name (left-aligned), importance as a percentage.
for rank, (weight, column) in enumerate(sorted_importance, start=1):
    print(f"{rank:2d}) {column:<30} {weight:.2%}")

 1) LeadTime                       33.17%
 2) AvgRoomPrice                   16.44%
 3) SpecialRequests                9.52%
 4) ArrivalDate                    8.69%
 5) ArrivalMonth                   7.73%
 6) MarketSegment                  6.13%
 7) NumWeekNights                  5.06%
 8) NumWeekendNights               3.70%
 9) NumAdults                      2.51%
10) ArrivalYear                    2.49%
11) MealPlan                       1.59%
12) RoomType                       1.49%
13) NumChildren                    0.68%
14) Parking                        0.57%
15) RepeatedGuest                  0.11%
16) NumPreviousNonCancelled        0.09%
17) NumPrevCancellations           0.03%


In [48]:
# Score the test frame with the same feature set and 0.479 threshold used above.
X_df2 = df2.drop(columns=['BookingID', 'BookingStatus'])
predicted_booking_status = rtc.predict_proba(X_df2)[:, 1] >= 0.479

# NOTE(review): the predictions are stored as booleans, not as the original
# string labels. True means the class-1 probability reached the threshold; in the
# encoded training frame shown earlier, class 1 corresponds to 'Not_Canceled'.
# Confirm which format the consumer of the output file expects.
df2['BookingStatus'] = predicted_booking_status

# Re-read the raw test file so the result table shows the original (unscaled,
# unencoded) values next to the predicted status.
df4 = pd.read_csv('data/test_data.csv').assign(BookingStatus=df2['BookingStatus'])
df4
# df4.to_csv('team4new.csv', index=False)

Unnamed: 0,BookingID,LeadTime,ArrivalYear,ArrivalMonth,ArrivalDate,NumWeekendNights,NumWeekNights,MealPlan,Parking,RoomType,NumAdults,NumChildren,MarketSegment,RepeatedGuest,NumPrevCancellations,NumPreviousNonCancelled,AvgRoomPrice,SpecialRequests,BookingStatus
0,29020,211,2018,5,20,0,2,Meal Plan 1,0,Room_Type 1,2,0,Online,0,0,0,100.00,0,False
1,29021,121,2018,7,6,0,4,Meal Plan 1,0,Room_Type 1,3,0,Offline,0,0,0,96.90,1,True
2,29022,30,2018,11,26,2,1,Not Selected,0,Room_Type 1,2,0,Online,0,0,0,88.00,0,False
3,29023,256,2018,6,15,0,2,Meal Plan 2,0,Room_Type 1,2,0,Online,0,0,0,115.00,1,False
4,29024,122,2018,11,25,0,1,Meal Plan 1,0,Room_Type 1,1,0,Corporate,0,0,0,67.00,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7250,36270,65,2018,7,27,0,3,Meal Plan 1,0,Room_Type 6,2,2,Online,0,0,0,177.30,0,False
7251,36271,28,2018,9,18,2,0,Meal Plan 1,0,Room_Type 1,2,0,Online,0,0,0,149.00,1,True
7252,36272,116,2018,2,28,2,1,Meal Plan 1,0,Room_Type 1,1,0,Online,0,0,0,1.00,0,True
7253,36273,148,2018,7,1,2,6,Meal Plan 1,0,Room_Type 1,2,0,Online,0,0,0,98.39,2,True


In [25]:
import pickle

# Persist the fitted random forest to disk.
# Only unpickle model files you trust: loading a pickle can execute arbitrary code.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(rtc, model_file)