In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("Hotel_bookings_cleaned.csv")
df.head()

Unnamed: 0,Booking_id,Room_type,Check_in_date,Check_out_date,Stay_duration,Room_price,Room_service_cost,Total_bill,Price_per_day
0,1,Standard,2025-07-23,2025-07-26,3,2000,1928,7928,2642.666667
1,2,Suite,2024-06-30,2024-07-03,3,12000,1692,37692,12564.0
2,3,Suite,2025-04-30,2025-05-02,2,12000,43,24043,12021.5
3,4,Standard,2024-07-27,2024-08-03,7,2000,1358,15358,2194.0
4,5,Standard,2024-06-14,2024-06-18,4,2000,1413,9413,2353.25


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Booking_id         10000 non-null  int64  
 1   Room_type          10000 non-null  object 
 2   Check_in_date      10000 non-null  object 
 3   Check_out_date     10000 non-null  object 
 4   Stay_duration      10000 non-null  int64  
 5   Room_price         10000 non-null  int64  
 6   Room_service_cost  10000 non-null  int64  
 7   Total_bill         10000 non-null  int64  
 8   Price_per_day      10000 non-null  float64
dtypes: float64(1), int64(5), object(3)
memory usage: 703.3+ KB


In [4]:
# Total revenue per booking
target = "Total_bill"

Create Date-Based Features

In [5]:
# seasonality & demand behavior
df["Check_in_date"] = pd.to_datetime(df["Check_in_date"])

df["Checkin_month"] = df["Check_in_date"].dt.month
df["Checkin_dayofweek"] = df["Check_in_date"].dt.dayofweek
df["Is_weekend"] = df["Checkin_dayofweek"].isin([5, 6]).astype(int)

Encode Categorical Features

In [6]:
df = pd.get_dummies(df, columns=["Room_type"], drop_first=True)

In [7]:
"Room_type" in df.columns

False

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Booking_id           10000 non-null  int64         
 1   Check_in_date        10000 non-null  datetime64[ns]
 2   Check_out_date       10000 non-null  object        
 3   Stay_duration        10000 non-null  int64         
 4   Room_price           10000 non-null  int64         
 5   Room_service_cost    10000 non-null  int64         
 6   Total_bill           10000 non-null  int64         
 7   Price_per_day        10000 non-null  float64       
 8   Checkin_month        10000 non-null  int32         
 9   Checkin_dayofweek    10000 non-null  int32         
 10  Is_weekend           10000 non-null  int64         
 11  Room_type_Executive  10000 non-null  bool          
 12  Room_type_Standard   10000 non-null  bool          
 13  Room_type_Suite      10000 non-n

In [9]:
df["Check_in_date"] = pd.to_datetime(df["Check_in_date"])
df["Check_out_date"] = pd.to_datetime(df["Check_out_date"])

In [10]:
df["Checkin_month"] = df["Check_in_date"].dt.month
df["Checkin_dayofweek"] = df["Check_in_date"].dt.dayofweek
df["Is_weekend"] = df["Checkin_dayofweek"].isin([5, 6]).astype(int)

In [11]:
df = df.drop(columns=["Check_in_date", "Check_out_date"])

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Booking_id           10000 non-null  int64  
 1   Stay_duration        10000 non-null  int64  
 2   Room_price           10000 non-null  int64  
 3   Room_service_cost    10000 non-null  int64  
 4   Total_bill           10000 non-null  int64  
 5   Price_per_day        10000 non-null  float64
 6   Checkin_month        10000 non-null  int32  
 7   Checkin_dayofweek    10000 non-null  int32  
 8   Is_weekend           10000 non-null  int64  
 9   Room_type_Executive  10000 non-null  bool   
 10  Room_type_Standard   10000 non-null  bool   
 11  Room_type_Suite      10000 non-null  bool   
dtypes: bool(3), float64(1), int32(2), int64(6)
memory usage: 654.4 KB


In [13]:
numeric_cols = [
    "Stay_duration",
    "Room_price",
    "Room_service_cost",
    "Price_per_day"
]

Apply scaling

In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])


Define Features (X) and Target (y)

In [15]:
X = df.drop(columns=["Total_bill", "Booking_id"])
y = df["Total_bill"]

X.shape, y.shape

((10000, 10), (10000,))

Train-Test Split

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Feature engineering prepared the hotel booking dataset for machine learning by encoding categorical variables, extracting temporal patterns, scaling numerical features, and separating predictors from the target variable to ensure robust and unbiased model training.

In [17]:
df.to_csv("Hotel_bookings_MLdata.csv", index=False)

In [18]:
#Import ML Libraries
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [19]:
model = LinearRegression()
model.fit(X_train, y_train)


In [20]:
#Make Predictions
y_pred = model.predict(X_test)


In [21]:
#Evaluate Model Performance
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("MAE:", mae)
print("RMSE:", rmse)
print("R² Score:", r2)


MAE: 6057.801865173884
RMSE: 7758.466822764883
R² Score: 0.8718930782707472


A linear regression model was trained to predict total booking revenue. Model performance was evaluated using MAE, RMSE, and R² score to measure prediction accuracy and variance explanation.

Feature Importance (Explainability)

In [22]:
feature_importance = pd.Series(
    model.coef_,
    index=X.columns
).sort_values(ascending=False)

feature_importance
#Features such as stay duration, room price, and room type have the strongest influence on total revenue.

Room_price             15781.114914
Stay_duration          12640.230272
Room_type_Suite         5770.651161
Room_type_Executive     2906.601854
Room_service_cost       1128.767403
Is_weekend               308.633927
Checkin_month              6.153568
Checkin_dayofweek        -18.913014
Room_type_Standard     -1351.628637
Price_per_day          -3088.747136
dtype: float64

In [23]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

rf_pred = rf.predict(X_test)

print("Random Forest R²:", r2_score(y_test, rf_pred))


Random Forest R²: 0.9999979874272079
