In [1]:
import warnings
warnings.filterwarnings('ignore')

import pycountry
import pickle
import folium
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import missingno as msno

from joblib import dump, load
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from folium.plugins import HeatMap

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Path of the training parquet file, relative to the working directory.
dataset_path = './data/lumen/train.parquet'
# Read the parquet file into the working frame `df`.
df = pd.read_parquet(path=dataset_path)

In [3]:
# Columns to convert to datetime; values that cannot be parsed become NaT
# because of errors='coerce'.
date_columns = ['stay_date', 'reservation_date', 'date_from', 'date_to', 'cancel_date']

df = df.assign(
    **{name: pd.to_datetime(df[name], errors='coerce') for name in date_columns}
)

In [4]:
# Every column whose name contains the substring "tax" is removed.
tax_columns = [name for name in df.columns if 'tax' in name]
df = df.drop(columns=tax_columns)

In [5]:
# Fill missing values in the price columns with a 5-nearest-neighbour imputer.
# The imputer is fitted on, and applied to, the same four columns of `df`.
columns_to_impute = ['price', 'total_price', 'food_price', 'other_price']
knn_imputer = KNNImputer(n_neighbors=5)
imputed_prices = knn_imputer.fit_transform(df[columns_to_impute])
df[columns_to_impute] = imputed_prices

In [6]:
# Drop rows that fail any of three consistency checks. Each check is a
# row-wise comparison, so applying them as one combined mask keeps exactly
# the rows that sequential filtering would keep. Comparisons involving
# NaN/NaT evaluate to False, so such rows are retained.
#
# The mask is no longer called `filter`, which shadowed the Python builtin
# for every later cell.
no_guests = (df['children_cnt'] == 0) & (df['adult_cnt'] == 0)
reserved_after_date_from = df['reservation_date'] > df['date_from']
date_from_before_cancel = df['date_from'] < df['cancel_date']

# .copy() makes the result an independent frame, so the column assignments
# in later cells do not trigger SettingWithCopyWarning.
df = df[~no_guests & ~reserved_after_date_from & ~date_from_before_cancel].copy()

In [7]:
# Derive calendar features from stay_date in one pass.
stay_dt = df['stay_date'].dt

df = df.assign(
    # Day of the week (Monday=0, Sunday=6)
    stay_day_of_week=stay_dt.dayofweek,
    # Calendar month
    stay_month=stay_dt.month,
    # Calendar year
    stay_year=stay_dt.year,
    # Day of the month
    stay_day_of_month=stay_dt.day,
    # True for Saturday (5) and Sunday (6)
    stay_is_weekend=stay_dt.dayofweek > 4,
    # Quarter of the year
    stay_quarter=stay_dt.quarter,
    # ISO week number of the year
    stay_week_of_year=stay_dt.isocalendar().week,
)

In [8]:
# Integer-encode the two categorical columns with a single LabelEncoder.
# The encoder is re-fitted for each column, so after this cell `le` holds
# the classes of reservation_status only.
le = LabelEncoder()

encoded_names = {
    'guest_country_id': 'guest_country_id_encoded',
    'reservation_status': 'reservation_status_encoded',
}
for source_name, target_name in encoded_names.items():
    df[target_name] = le.fit_transform(df[source_name])

# The raw categorical columns and resort_id are removed.
df = df.drop(columns=['guest_country_id', 'reservation_status', 'resort_id'])

In [9]:
# lead_time: whole days between reservation_date and date_from.
# total_guests: children plus adults.
df = df.assign(
    lead_time=(df['date_from'] - df['reservation_date']).dt.days,
    total_guests=df['children_cnt'].add(df['adult_cnt']),
)

In [10]:
# Columns removed before modelling (guest counts are already summarised by
# total_guests, which was built from children_cnt and adult_cnt).
useless_col = [
    'adult_cnt', 'stay_year', 'children_cnt', 'food_price',
    'other_price', 'price', 'reservation_id', 'guest_id',
]
df = df.drop(columns=useless_col)

In [11]:
# Collapse encoded statuses 0 and 2 into 0; every other code is left as is.
# NOTE(review): which statuses the codes 0/1/2 stand for depends on the
# LabelEncoder's class ordering — confirm against the raw labels.
status_codes = df['reservation_status_encoded']
df['reservation_status_encoded'] = status_codes.mask(status_codes.isin([0, 2]), 0)

In [12]:
# Store the boolean weekend flag as an integer (0/1).
df = df.astype({'stay_is_weekend': int})

In [13]:
# Keep only the rows whose encoded reservation status equals 1.
filtered_df = df.loc[df['reservation_status_encoded'].eq(1)]

In [14]:
def _most_frequent(values):
    """Return the first mode of ``values``, or None when there is no mode."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


# Flag rows whose reservation_date equals their stay_date once, up front.
# Summing this flag per stay_date replaces a per-group lambda that looked the
# rows up again in the global `filtered_df` via `.loc[x.index]`; that lookup
# was slow and would over-count if the index ever held duplicate labels.
# The flag lives on a temporary frame, so `filtered_df` keeps its columns.
_daily_source = filtered_df.assign(
    _reserved_on_stay_date=(
        filtered_df['reservation_date'] == filtered_df['stay_date']
    ).astype('int64')
)

# Named aggregation: output column -> (source column, aggregation).
# The output columns and their order match the previous nested spec.
aggregations = {
    'total_rooms': ('room_cnt', 'sum'),
    # Number of rows per stay_date with reservation_date == stay_date.
    # NOTE(review): this counts rows, not room_cnt — confirm that is intended.
    'rooms_reserved_that_day': ('_reserved_on_stay_date', 'sum'),
    'average_room_price': ('total_price', 'mean'),
    'average_lead_time': ('lead_time', 'mean'),
    'room_category': ('room_category_id', _most_frequent),
    'sales_channel_mode': ('sales_channel_id', _most_frequent),
    # The calendar features are constant within a stay_date, so 'min' simply
    # carries the value through.
    'stay_day_of_week': ('stay_day_of_week', 'min'),
    # Output name kept as 'stay_of_month' for compatibility; it holds stay_month.
    'stay_of_month': ('stay_month', 'min'),
    'stay_day_of_month': ('stay_day_of_month', 'min'),
    'stay_is_weekend': ('stay_is_weekend', 'min'),
    'stay_quarter': ('stay_quarter', 'min'),
    'stay_week_of_year': ('stay_week_of_year', 'min'),
}

# One row per stay_date, flat column names.
grouped = _daily_source.groupby('stay_date').agg(**aggregations)

In [15]:
# Order the daily rows by stay_date, then add the previous row's value of
# each listed column as a `<column>_lag1` feature. shift(1) moves by one row,
# so the first row's lag values are NaN.
grouped = grouped.sort_index()

lag_source_columns = [
    'rooms_reserved_that_day', 'total_rooms', 'average_room_price',
    'average_lead_time', 'room_category', 'sales_channel_mode',
]
grouped = grouped.assign(
    **{f'{col}_lag1': grouped[col].shift(1) for col in lag_source_columns}
)

In [16]:
# Turn the stay_date index into a regular column.
# The guard makes the cell safe to re-run: without it, a second run would
# reset the new integer index into an 'index' column and rename it to
# 'stay_date', leaving two columns with the same name.
if 'stay_date' not in grouped.columns:
    grouped = grouped.reset_index()
    # Only has an effect when the index was unnamed and came out as 'index'.
    grouped = grouped.rename(columns={'index': 'stay_date'})

In [17]:
# Chronological split: stay dates before 2009-03-01 are used for training,
# stay dates on or after it for testing.
is_before_cutoff = grouped['stay_date'] < '2009-03-01'
is_on_or_after_cutoff = grouped['stay_date'] >= '2009-03-01'

train_data = grouped.loc[is_before_cutoff]
test_data = grouped.loc[is_on_or_after_cutoff]

In [18]:
# Model inputs: two calendar features, the weekend flag, and the previous
# row's total_rooms and rooms_reserved_that_day.
selected_features = [
    'stay_day_of_week',
    'total_rooms_lag1',
    'stay_is_weekend',
    'stay_week_of_year',
    'rooms_reserved_that_day_lag1',
]

# Feature matrix and target (rooms_reserved_that_day) for each split.
X_train, y_train = train_data[selected_features], train_data['rooms_reserved_that_day']
X_test, y_test = test_data[selected_features], test_data['rooms_reserved_that_day']

# Kept separately so predictions can be matched back to their dates.
test_dates = test_data['stay_date']

In [19]:
# Fit a random forest on the training split; random_state fixes the seed.
model = RandomForestRegressor(
    n_estimators=700,
    max_depth=6,
    min_samples_leaf=2,
    random_state=42,
)
model.fit(X_train, y_train)

# Persist the fitted model next to the notebook.
dump(model, 'random_forest_regressor.joblib')

# Predict the test split and pair each prediction with its stay_date.
predictions = model.predict(X_test)

predicted_rooms_df = pd.DataFrame({
    'stay_date': test_dates,
    'predicted_number_of_rooms_that_day': predictions,
})
predicted_rooms_df = predicted_rooms_df.sort_values(by='stay_date')
print(predicted_rooms_df.head())

     stay_date  predicted_number_of_rooms_that_day
427 2009-03-01                            4.741856
428 2009-03-02                            3.593567
429 2009-03-03                            5.668286
430 2009-03-04                            7.482438
431 2009-03-05                            7.532631


In [20]:
# Root mean squared error on the test split.
# The square root is taken explicitly: mean_squared_error's `squared=False`
# argument is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 2.9410512965429825


In [21]:
# Actual and predicted values side by side, indexed like y_test.
results_df = y_test.to_frame('Actual').assign(Predicted=predictions)

def safe_mean_absolute_percentage_error(actual, predicted):
    """Mean absolute percentage error over the entries whose actual is non-zero.

    Entries where ``actual == 0`` are skipped, because the percentage error
    is undefined there.

    Parameters
    ----------
    actual : array-like
        Observed values.
    predicted : array-like
        Predicted values, aligned position by position with ``actual``.

    Returns
    -------
    float
        MAPE in percent (0-100 scale, not a fraction). NaN when no entry of
        ``actual`` is non-zero, including empty input.
    """
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    non_zero_actual = actual != 0  # mask of entries that can be divided by
    if not non_zero_actual.any():
        # Nothing to average: return NaN directly rather than taking the mean
        # of an empty array, which emits a RuntimeWarning.
        return float('nan')

    actual_filtered = actual[non_zero_actual]
    predicted_filtered = predicted[non_zero_actual]
    relative_errors = np.abs((actual_filtered - predicted_filtered) / actual_filtered)
    return np.mean(relative_errors) * 100

In [22]:
# MAPE over the test rows whose actual value is non-zero, printed in percent.
safe_mape = safe_mean_absolute_percentage_error(
    actual=results_df['Actual'],
    predicted=results_df['Predicted'],
)
print(f"Safe Mean Absolute Percentage Error (MAPE): {safe_mape:.2f}%")

Safe Mean Absolute Percentage Error (MAPE): 63.20%


In [23]:
# Prediction errors on the test split (actual minus predicted).
residuals = y_test.sub(predictions)

# Spread of those errors, computed with np.std.
sigma = np.std(residuals)

print("Estimated standard deviation of the prediction errors (sigma):", sigma)

Estimated standard deviation of the prediction errors (sigma): 2.930418955558022
