In [1]:
#Import ML Libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
#loading dataset
df = pd.read_csv("Hotel_bookings_MLdata.csv")
df.head()

Unnamed: 0,Booking_id,Stay_duration,Room_price,Room_service_cost,Total_bill,Price_per_day,Checkin_month,Checkin_dayofweek,Is_weekend,Room_type_Executive,Room_type_Standard,Room_type_Suite
0,1,-0.513717,-1.171814,0.497471,7928,-1.131923,7,2,0,False,True,False
1,2,-0.513717,1.436855,0.225361,37692,1.421637,6,6,1,False,False,True
2,3,-1.014367,1.436855,-1.675954,24043,1.282008,4,2,0,False,False,True
3,4,1.488883,-1.171814,-0.159745,15358,-1.247401,7,5,1,False,True,False
4,5,-0.013067,-1.171814,-0.096329,9413,-1.206413,6,4,0,False,True,False


In [3]:
#Define Features and Target
X = df.drop(columns=["Total_bill", "Booking_id"])
y = df["Total_bill"]

In [4]:
#Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

### Linear Regression Model

In [5]:
#Train Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

In [6]:
#Make Predictions
y_pred = model.predict(X_test)


In [7]:
#Model Evaluation
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("MAE:", mae)
print("RMSE:", rmse)
print("R² Score:", r2)

MAE: 6057.801865173884
RMSE: 7758.466822764882
R² Score: 0.8718930782707472


The linear regression model predicts total booking revenue with reasonable accuracy. The R² score indicates how much variance in revenue is explained by booking features such as stay duration, room pricing, and room type.

In [8]:
#Model Interpretation
# Feature Importance (Model Explainability)
feature_importance = pd.Series(
    model.coef_,
    index=X.columns
).sort_values(ascending=False)

feature_importance

Room_price             15781.114914
Stay_duration          12640.230272
Room_type_Suite         5770.651161
Room_type_Executive     2906.601854
Room_service_cost       1128.767403
Is_weekend               308.633927
Checkin_month              6.153568
Checkin_dayofweek        -18.913014
Room_type_Standard     -1351.628637
Price_per_day          -3088.747136
dtype: float64

The model shows that room price, stay duration, and premium room categories are the strongest drivers of hotel revenue. Suite and executive room bookings significantly increase total revenue, while standard rooms contribute less. Longer stays and additional room services further boost revenue, indicating that upselling, premium pricing, and long-stay incentives are key revenue optimization strategies.

### Random Forest Model

In [9]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(
    n_estimators=200,
    random_state=42
)

rf_model.fit(X_train, y_train)

In [10]:
#predict
rf_pred = rf_model.predict(X_test)

In [11]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

rf_mae = mean_absolute_error(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_r2 = r2_score(y_test, rf_pred)

print("Random Forest MAE:", rf_mae)
print("Random Forest RMSE:", rf_rmse)
print("Random Forest R2:", rf_r2)

Random Forest MAE: 6.203379999999992
Random Forest RMSE: 24.027599240768897
Random Forest R2: 0.9999987713122159


### Model Comparison

In [12]:
#Create comparison table
model_comparison = pd.DataFrame({
    "Model": ["Linear Regression", "Random Forest"],
    "MAE": [mae, rf_mae],
    "RMSE": [rmse, rf_rmse],
    "R2 Score": [r2, rf_r2]
}, index=[1, 2])

model_comparison


Unnamed: 0,Model,MAE,RMSE,R2 Score
1,Linear Regression,6057.801865,7758.466823,0.871893
2,Random Forest,6.20338,24.027599,0.999999


In [13]:
#Feature importance
rf_feature_importance = pd.Series(
    rf_model.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

rf_feature_importance


Stay_duration          4.719633e-01
Price_per_day          3.702745e-01
Room_price             1.131281e-01
Room_type_Executive    1.970742e-02
Room_type_Suite        1.790444e-02
Room_type_Standard     6.259280e-03
Room_service_cost      7.627157e-04
Checkin_month          7.109063e-08
Checkin_dayofweek      5.632366e-08
Is_weekend             1.528554e-08
dtype: float64

### Random Forest Interpretation

The Random Forest model identifies **stay duration** and **price per day** as the most influential factors in predicting hotel revenue, indicating that how long guests stay and the effective daily pricing strategy drive revenue more than the base room price alone.

#### Key Insights
- **Stay duration** is the strongest revenue driver, highlighting the importance of length-of-stay optimization.
- **Price per day** plays a critical role in revenue generation through non-linear pricing effects.
- **Room price and premium room types** (Suite, Executive) contribute positively but have a secondary impact.
- **Seasonal and temporal features** (month, day of week, weekend) show minimal influence on revenue.

Overall, the Random Forest results suggest that **dynamic pricing strategies and long-stay incentives** are more effective for revenue maximization than relying solely on room category or seasonality.


### Model Comparison Insight

Linear Regression provides clear interpretability by showing the direction of revenue impact, while Random Forest outperforms it by capturing non-linear relationships and feature interactions. While Linear Regression emphasized room pricing and premium categories, Random Forest revealed that stay duration and daily pricing are the dominant drivers of revenue, making it more suitable for real-world revenue prediction.

***Random Forest was selected as the final model due to its superior performance and ability to capture non-linear relationships in hotel pricing and customer stay behavior.***

In [14]:
import joblib

joblib.dump(rf_model, "revenue_model.pkl")



['revenue_model.pkl']

In [15]:
X.columns


Index(['Stay_duration', 'Room_price', 'Room_service_cost', 'Price_per_day',
       'Checkin_month', 'Checkin_dayofweek', 'Is_weekend',
       'Room_type_Executive', 'Room_type_Standard', 'Room_type_Suite'],
      dtype='object')

In [16]:
import joblib
import os

print("Current working directory:", os.getcwd())

joblib.dump(rf_model, "revenue_model.pkl")
joblib.dump(X.columns.tolist(), "model_features.pkl")

print("Files saved:", os.listdir())



Current working directory: C:\Users\priya\Downloads
Files saved: ['(1) Priyanka P _ LinkedIn_files', '.ipynb_checkpoints', '1.Data_Generation.ipynb', '1000--Car-Crash-Gameplay-Reels-Bundle-2025-30-10-04-43-51.pdf', '1000060907.jpg', '1000061070.jpg', '109106166.pdf', '191011110307-02-australia-beautiful-places.webp', '2.Data Manipulation Using Pandas.ipynb', '3.Data_cleaning.ipynb', '3288858509608305561_60020734901.mp4', '4.Exploratory_data_analysis.ipynb', '5.Feature Engineering-Copy1.ipynb', '5.Feature Engineering.ipynb', '6.Model_Evaluation.ipynb', '711592668-pdfcoffee-com-netzwerk-deutsch-als-fremdsprache-a1-2-pdf-free.pdf', 'Aadhaar SUDHARSHAN.pdf', 'aadhar pri.pdf', 'aadhar pri_compressed.pdf', 'aadhar pri_compressed_page-0001.jpg', 'aadhar pri_page-0001.jpg', 'AI learning', 'AIML resume (1).pdf', 'AIML resume (2).pdf', 'AIML resume.pdf', 'AimlPriya.pdf', 'AmnresMLpri (1).pdf', 'AmnresMLpri.pdf', 'anaconda-navigator-2.6.6-py39haa95532_2.tar.bz2', 'Anaconda3-2025.06-0-Windows-x86_